library(here)
setwd(here::here())
library(dplyr)
library(ggplot2)
library(tidyr)
library(purrr)
library(xts)
output_file <- "data/cleaned_etfs.csv"
df <- read.csv(output_file)
df$Date <- as.Date(df[, 1])  
data_xts <- xts(df[, -1], order.by = df$Date)
source("src/stock_list.R")
source("src/generate_dataset.R")
begin_date <- as.Date("2010-01-01")
end_date <- as.Date("2024-05-01")
output_file <- "data/cleaned_etfs.csv"

generate_dataset(stock_namelist, begin_date, end_date, output_file)
source("src/func_partial_ci.R")
# load csv created in the chunk above

# Define the ticker you want to fit
stock_tickers <- colnames(data_xts)

# Crea le combinazioni di coppie da stimare
stock_pairs <- combn(stock_tickers, 2, simplify = FALSE)

# Parametri di rolling
estimation_years <- 3
rolling_step_months <- 6
save_dir <- "results/fit"

# Esegui il backtest rolling
run_partial_ci_backtest(stock_pairs, data_xts, estimation_years, rolling_step_months, save_dir)
source("src/filtering_func.R")

results_folder <- "results/fit"
save_dir <- "results/pairs"

# Filter parameters
rho_min <- 0.9
rho_max <- 0.98
rsq_min <- 0.9
loglik_max <- 0

for (year in 2013:2024) {
  for (half in c("H1", "H2")) {
    process_period(year, half,
                   results_folder = results_folder,
                   rho_min = rho_min,
                   rho_max = rho_max,
                   rsq_min = rsq_min,
                   loglik_max = loglik_max,
                   save_dir = save_dir)
  }
}
##  Saved filtered pairs for 2013_H1 to results/pairs/pairs_2013_H1.RData 
##  Saved filtered pairs for 2013_H2 to results/pairs/pairs_2013_H2.RData 
##  Saved filtered pairs for 2014_H1 to results/pairs/pairs_2014_H1.RData 
##  Saved filtered pairs for 2014_H2 to results/pairs/pairs_2014_H2.RData 
##  Saved filtered pairs for 2015_H1 to results/pairs/pairs_2015_H1.RData 
##  Saved filtered pairs for 2015_H2 to results/pairs/pairs_2015_H2.RData 
##  Saved filtered pairs for 2016_H1 to results/pairs/pairs_2016_H1.RData 
##  Saved filtered pairs for 2016_H2 to results/pairs/pairs_2016_H2.RData 
##  Saved filtered pairs for 2017_H1 to results/pairs/pairs_2017_H1.RData 
##  Saved filtered pairs for 2017_H2 to results/pairs/pairs_2017_H2.RData 
##  Saved filtered pairs for 2018_H1 to results/pairs/pairs_2018_H1.RData 
##  Saved filtered pairs for 2018_H2 to results/pairs/pairs_2018_H2.RData 
##  Saved filtered pairs for 2019_H1 to results/pairs/pairs_2019_H1.RData 
##  Saved filtered pairs for 2019_H2 to results/pairs/pairs_2019_H2.RData 
##  Saved filtered pairs for 2020_H1 to results/pairs/pairs_2020_H1.RData 
##  Saved filtered pairs for 2020_H2 to results/pairs/pairs_2020_H2.RData 
##  Saved filtered pairs for 2021_H1 to results/pairs/pairs_2021_H1.RData 
##  Saved filtered pairs for 2021_H2 to results/pairs/pairs_2021_H2.RData 
##  Saved filtered pairs for 2022_H1 to results/pairs/pairs_2022_H1.RData 
##  Saved filtered pairs for 2022_H2 to results/pairs/pairs_2022_H2.RData 
##  Saved filtered pairs for 2023_H1 to results/pairs/pairs_2023_H1.RData 
##  Saved filtered pairs for 2023_H2 to results/pairs/pairs_2023_H2.RData 
##  Saved filtered pairs for 2024_H1 to results/pairs/pairs_2024_H1.RData 
##  File not found: results/fit/res_2024_H2.RData
pairs_dir <- "results/pairs/"

#  list all the pairs
pair_files <- list.files(pairs_dir, pattern = "^pairs_.*\\.RData$", full.names = TRUE)

all_pairs <- list()

for (file in pair_files) {
  temp_env <- new.env()
  load(file, envir = temp_env)
  var_name <- ls(temp_env)[grepl("^pairs_", ls(temp_env))]
  pairs <- get(var_name, envir = temp_env)
  all_pairs[[gsub("pairs_|\\.RData", "", basename(file))]] <- pairs
}

# Rbind all the pairs in a unique dataset
pairs_df <- do.call(rbind, lapply(names(all_pairs), function(period) {
  do.call(rbind, lapply(all_pairs[[period]], function(pair) {
    data.frame(period = period, stock_a = pair[1], stock_b = pair[2])
  }))
}))
# Count the number of selected pairs for each period
pair_counts <- pairs_df %>%
  group_by(period) %>%
  summarise(num_pairs = n())

ggplot(pair_counts, aes(x = period, y = num_pairs)) +
  geom_col(fill = "steelblue") +
  theme_minimal() +
  labs(title = " Number of Selected Pairs per Period",
       x = "Period",
       y = "# Pairs") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#  Crea nome univoco per ogni coppia (ordine alfabetico per evitare duplicati invertiti)
pairs_df <- pairs_df %>%
  mutate(pair = paste(pmin(stock_a, stock_b), pmax(stock_a, stock_b), sep = " - "))

#  Conta le ricorrenze per ciascuna coppia
top_pairs <- pairs_df %>%
  count(pair, sort = TRUE) %>%
  top_n(10, n)

# ️ Grafico a barre
ggplot(top_pairs, aes(x = reorder(pair, n), y = n)) +
  geom_col(fill = "darkorange") +
  coord_flip() +
  theme_minimal() +
  labs(title = " Most Frequent PCI Pairs Across Periods",
       x = "Pair",
       y = "Number of Periods Cointegrated")

source("src/plots_func.R")

walk(top_pairs$pair, function(p) {
  tickers <- unlist(strsplit(p, " - "))
  print(plot_pair_log_price_change(data_xts, tickers[1], tickers[2],
                                   start_date = "2014-01-01", end_date = "2024-01-01"))
})